[1] "2.34.1"
modelling workflow for risk and safety
prepared for the Health & Safety Executive, Science Division
13 Feb 2025
BUGS/JAGS to Stan — Stan tunes its hyperparameters during warmup! (Hoffman, M.D. and Gelman, A., 'The No-U-Turn Sampler', JMLR, 2014)
BUGS/JAGS to Stan — StanCon, anyone?
‘democratising scalable UQ’ …once installed 🤔
looking for evidence of active corrosion growth
3×5 DataFrame
Row │ anomaly_id soil_type inspection measured_depth_mm T
│ Int64 String1 String3 Float64 Int64
─────┼─────────────────────────────────────────────────────────────
1 │ 1 A i_0 0.0672091 0
2 │ 2 A i_0 0.104202 0
3 │ 3 A i_0 0.100588 0
data {
  int<lower=0> n_anomalies;                 // number of anomalies
  int<lower=0> n_inspections;               // number of inspections
  vector[n_anomalies * n_inspections] cgr;  // growth rate observations (2 per anomaly)
}
parameters {
  real<lower=0> mu;     // mean growth rate
  real<lower=0> sigma;  // standard deviation of growth rates
}
model {
  // likelihood, written vectorised: identical target to the
  // element-wise loop `for (i ...) cgr[i] ~ normal(mu, sigma);`
  cgr ~ normal(mu, sigma);
  /*
  //some suggested priors (currently the model runs with flat priors
  //on the positive-constrained mu and sigma):
  mu ~ normal(1/4, 3);
  sigma ~ exponential(1);
  */
}
generated quantities {
  // posterior predictive draw for every observation
  vector[n_anomalies * n_inspections] cgr_pred;
  //vector[n_anomalies * n_inspections] log_lik;  // enable for loo/waic
  for (n in 1 : (n_anomalies * n_inspections)) {
    cgr_pred[n] = normal_rng(mu, sigma);
    //log_lik[n] = normal_lpdf(cgr[n] | mu, sigma);
  }
}
INFO:cmdstanpy:found newer exe file, not recompiling
// Simple pooled model: every observed corrosion growth rate is an
// i.i.d. draw from a single normal(mu, sigma) distribution.
data {
  int <lower = 0> n_anomalies; // number of anomalies
  int <lower = 0> n_inspections; // number of inspections
  vector[n_anomalies * n_inspections] cgr; // growth rate observations (2 per anomaly)
}
parameters {
  real<lower = 0> mu; // mean growth rate
  real<lower = 0> sigma; // standard deviation
}
model {
  // likelihood: element-wise sampling statement over all observations
  // (no explicit priors are active, so mu and sigma get flat priors
  // on their positive-constrained support)
  for(i in 1:(n_anomalies * n_inspections)){
    cgr[i] ~ normal(mu, sigma);
  }
  /*
  //alternative (vectorised) implementation:
  // NOTE(review): `delta_C` is not declared in this program — the data
  // vector is named `cgr`; this line would need renaming if enabled.
  delta_C ~ normal(mu, sigma);
  //some suggested priors
  mu ~ normal(1/4, 3);
  sigma ~ exponential(1);
  */
}
generated quantities {
  // posterior predictive draw per observation; log_lik is available
  // (commented out) for loo/waic model comparison
  vector[n_anomalies * n_inspections] cgr_pred;
  //vector[n_anomalies * n_inspections] log_lik;
  for (i in 1:(n_anomalies * n_inspections)) {
    cgr_pred[i] = normal_rng(mu, sigma);
    //log_lik[i] = normal_lpdf(delta_C[i] | mu, sigma); // NOTE(review): should read cgr[i]
  }
}
CmdStanR needs its input data as a list
# Convert raw inspection depths into per-interval corrosion growth rates.
# For each anomaly (inspections ordered by time T), pairs every measurement
# with the next one and returns one row per interval:
# anomaly_id, delta_C (depth change per unit time), soil_type.
prepare_data <- function(df = corrosion_data) {
  df |>
    arrange(anomaly_id, T) |>
    group_by(anomaly_id) |>
    mutate(
      next_depth = lead(measured_depth_mm),
      time_diff = lead(T) - T,
      delta_C = (next_depth - measured_depth_mm) / time_diff
    ) |>
    ungroup() |>
    # the last inspection of each anomaly has no successor: drop it
    filter(!is.na(next_depth)) |>
    select(anomaly_id, delta_C, soil_type)
}
# Assemble the list CmdStanR expects.
# Fix: call prepare_data() once (the original ran the whole pipeline twice);
# local() keeps the intermediate out of the global environment.
model_data <- local({
  prepared <- prepare_data()
  list(
    n_anomalies = prepared$anomaly_id |> unique() |> length(),
    n_inspections = 2,
    cgr = prepared$delta_C
  )
})
cgr_post <- cgr_model$sample(data = model_data)
Running MCMC with 4 sequential chains...
Chain 1 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 1 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 1 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 1 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 1 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 1 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 1 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 1 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 1 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 1 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 1 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1 finished in 0.1 seconds.
Chain 2 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 2 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 2 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 2 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 2 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 2 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 2 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 2 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 2 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 2 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 2 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2 finished in 0.1 seconds.
Chain 3 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 3 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 3 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 3 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 3 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 3 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 3 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 3 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 3 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 3 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 3 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3 finished in 0.1 seconds.
Chain 4 Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4 Iteration: 100 / 2000 [ 5%] (Warmup)
Chain 4 Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4 Iteration: 300 / 2000 [ 15%] (Warmup)
Chain 4 Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4 Iteration: 500 / 2000 [ 25%] (Warmup)
Chain 4 Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4 Iteration: 700 / 2000 [ 35%] (Warmup)
Chain 4 Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4 Iteration: 900 / 2000 [ 45%] (Warmup)
Chain 4 Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4 Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4 Iteration: 1100 / 2000 [ 55%] (Sampling)
Chain 4 Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4 Iteration: 1300 / 2000 [ 65%] (Sampling)
Chain 4 Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4 Iteration: 1500 / 2000 [ 75%] (Sampling)
Chain 4 Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4 Iteration: 1700 / 2000 [ 85%] (Sampling)
Chain 4 Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4 Iteration: 1900 / 2000 [ 95%] (Sampling)
Chain 4 Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4 finished in 0.1 seconds.
All 4 chains finished successfully.
Mean chain execution time: 0.1 seconds.
Total execution time: 0.8 seconds.
CmdStanPy needs its input data as a dictionary
def prepare_data(df = corrosion_data):
    """Compute per-interval corrosion growth rates (delta_C) per anomaly.

    NOTE(review): the default `df` is bound once at definition time, so
    `corrosion_data` must already exist; later rebinding the name does not
    change the default.
    """
    return (
        df.sort(['anomaly_id', 'T'])
        .group_by('anomaly_id')
        # agg packs each column into one list per anomaly; shift(-1)
        # pairs every depth/time with the following inspection's value
        .agg([
            pl.col('measured_depth_mm').shift(-1).alias('next_depth'),
            pl.col('T').shift(-1).alias('next_time'),
            pl.col('measured_depth_mm'),
            pl.col('T')
        ])
        # NOTE(review): here `next_depth` is list-typed, so this filter only
        # drops rows whose whole list is null; the per-element nulls that
        # shift(-1) creates survive until the final filter below — confirm
        # this ordering is intentional
        .filter(pl.col('next_depth').is_not_null())
        .with_columns([
            # growth rate = depth change / time elapsed between inspections
            ((pl.col('next_depth') - pl.col('measured_depth_mm')) /
             (pl.col('next_time') - pl.col('T'))).alias('delta_C')
        ])
        .select(['anomaly_id', 'delta_C'])
        .explode('delta_C')  # unnest the per-anomaly lists: one row per interval
        .filter(pl.col('delta_C').is_not_null())  # drop the trailing null from each shift(-1)
    )
# Assemble the dict CmdStanPy expects, then sample.
# Fix: call prepare_data() once (the original re-ran the whole polars
# pipeline for each dictionary entry).
_prepared = prepare_data()
model_data = {
    'n_anomalies': _prepared.select('anomaly_id').unique().height,
    'n_inspections': 2,
    'cgr': _prepared.select('delta_C').to_series().to_numpy(),
}
cgr_post = cgr_model.sample(data=model_data)
INFO:cmdstanpy:CmdStan start processing
chain 1 |[33m [0m| 00:00 Status
chain 2 |[33m [0m| 00:00 Status[A
chain 3 |[33m [0m| 00:00 Status[A[A
chain 4 |[33m [0m| 00:00 Status[A[A[A
chain 1 |[34m#######2 [0m| 00:00 Iteration: 1300 / 2000 [ 65%] (Sampling)
chain 3 |[34m######8 [0m| 00:00 Iteration: 1200 / 2000 [ 60%] (Sampling)[A[A
chain 2 |[34m######8 [0m| 00:00 Iteration: 1200 / 2000 [ 60%] (Sampling)[A
chain 4 |[34m######8 [0m| 00:00 Iteration: 1200 / 2000 [ 60%] (Sampling)[A[A[A
chain 1 |[34m##########[0m| 00:00 Sampling completed
chain 2 |[34m##########[0m| 00:00 Sampling completed
chain 3 |[34m##########[0m| 00:00 Sampling completed
chain 4 |[34m##########[0m| 00:00 Sampling completed
INFO:cmdstanpy:CmdStan done processing.
A Turing model needs its input data as arguments to the model function
# Compute per-interval corrosion growth rates for each anomaly.
# Inspections are ordered by time T; each consecutive pair yields one
# Δc = depth change per unit time. Negative rates are clamped to zero
# (a corrosion defect cannot shrink; negatives reflect measurement noise).
function prepare_data(df::DataFrame = corrosion_data)
    ordered = sort(df, [:anomaly_id, :T])
    rates = DataFrame()
    for grp in groupby(ordered, :anomaly_id)
        n = nrow(grp)
        n > 1 || continue  # a single inspection gives no growth interval
        for k in 1:(n - 1)
            growth = (grp[k + 1, :measured_depth_mm] - grp[k, :measured_depth_mm]) /
                     (grp[k + 1, :T] - grp[k, :T])
            push!(rates, (
                anomaly_id = grp[k, :anomaly_id], Δc = max(0, growth)
            ))
        end
    end
    return rates
end
cgr_post = prepare_data().Δc |>
data -> corrosion_growth(data) |>
           model -> sample(model, NUTS(), 1_000)
success
next up: how we can extend this to a robust and helpful workflow
☕
Turing.jl
n_draws = 1_000; n_chains = 4
# Sampler configurations illustrating Turing.jl's interchangeable MCMC back-ends.
# a no U-turn sampler, with 2000 adaptive (warmup) steps and a target acceptance rate of 0.65
NUTS_sampler = NUTS(2_000, 0.65)
# a Hamiltonian Monte Carlo sampler, with a step size of 0.05 and 10 leapfrog steps
HMC_sampler = HMC(0.05, 10)
# a Metropolis-Hastings sampler, using the default proposal distribution (priors)
MH_sampler = MH()
# a 'compositional' Gibbs sampler (Metropolis within Gibbs) - sampling μ with MH and σ with NUTS
# NOTE(review): the symbol-argument form Gibbs(MH(:μ), NUTS(..., :σ)) is the older
# Turing API; recent releases use Gibbs(:μ => MH(), :σ => NUTS(...)) — confirm version
Gibbs_sampler = Gibbs(MH(:μ), NUTS(2_000, 0.65, :σ))
# Fit the corrosion growth model with the supplied sampler, drawing
# `n_draws` samples on each of `n_chains` threaded chains.
run_mcmc = function(sampler)
    growth_rates = prepare_data().Δc
    model = corrosion_growth(growth_rates)
    return sample(model, sampler, MCMCThreads(), n_draws, n_chains)
end